Module 2 R Practice

1. Reading the dataset into R

property <- read.csv("fy2023-property-assessment-data.csv")

2. Cleaning the dataset

  • Importing Libraries

library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
  • Cleaning the col names

property <- clean_names(property)
  • replacing outliers

#creating a function to fiter outliers
filter_outliers <- function(x) {
  quantile1 <- quantile(x, 0.25, na.rm = TRUE)
  quantile3 <- quantile(x, 0.75, na.rm = TRUE)
  iqr_value <- IQR(x, na.rm = TRUE)
  
  lower_values <- quantile1 - 1.5 * iqr_value
  upper_values <- quantile3 + 1.5 * iqr_value
  
  outliers_removed <- ifelse(x < lower_values | x > upper_values, NA, x)
  x_cleaned <- ifelse(is.na(outliers_removed), median(x, na.rm = TRUE), outliers_removed) 
  
                    return(x_cleaned)
}

# Clean all numeric columns in the dataset
property_df_cleaned <- property %>%
  mutate(across(where(is.numeric), filter_outliers))

# Remove rows with any NA values after cleaning
property_df_cleaned <-na.omit(property_df_cleaned)
  • Extracting useful cols

property_extract<- subset(property_df_cleaned, select= 
                        c(city, bldg_type, num_bldgs ,res_units, bldg_value, land_sf,land_value, total_value, gross_tax, yr_built, yr_remodel, structure_class, roof_structure, bed_rms, full_bth, lu, ext_fnished, heat_type, prop_view, ac_type ))
  • Viewing data type

str(property_extract)
## 'data.frame':    180627 obs. of  20 variables:
##  $ city           : chr  "EAST BOSTON" "EAST BOSTON" "EAST BOSTON" "EAST BOSTON" ...
##  $ bldg_type      : chr  "RE - Row End" "RM - Row Middle" "RM - Row Middle" "RM - Row Middle" ...
##  $ num_bldgs      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ res_units      : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ bldg_value     : int  588500 615300 599200 530200 569100 1027100 993200 876500 613100 802800 ...
##  $ land_sf        : num  1150 1150 1150 1150 2010 2500 2500 2500 2500 2500 ...
##  $ land_value     : int  195700 196500 197100 197700 225700 261200 262000 262700 263200 224400 ...
##  $ total_value    : int  784200 811800 796300 727900 794800 1288300 1255200 1139200 876300 1027200 ...
##  $ gross_tax      : num  8422 8719 8552 7818 8536 ...
##  $ yr_built       : int  1900 1920 1905 1900 1900 1900 1900 1900 1900 1900 ...
##  $ yr_remodel     : num  2004 2000 1985 1991 1978 ...
##  $ structure_class: chr  "" "" "" "" ...
##  $ roof_structure : chr  "F - Flat" "F - Flat" "F - Flat" "M - Mansard" ...
##  $ bed_rms        : num  6 3 5 5 6 3 3 3 5 6 ...
##  $ full_bth       : int  3 3 3 3 3 1 1 3 3 3 ...
##  $ lu             : chr  "R3" "R3" "R3" "R3" ...
##  $ ext_fnished    : chr  "A - Asbestos" "M - Vinyl" "M - Vinyl" "M - Vinyl" ...
##  $ heat_type      : chr  "W - Ht Water/Steam" "F - Forced Hot Air" "S - Space Heat" "W - Ht Water/Steam" ...
##  $ prop_view      : chr  "A - Average" "A - Average" "A - Average" "A - Average" ...
##  $ ac_type        : chr  "N - None" "C - Central AC" "N - None" "N - None" ...

PART 1 - Descriptive Analysis

  • Descriptive Statistics for df grouped by CITY

# Used summary function to group by City
city_grouped <- property_extract %>%
                         group_by(city) %>%
                         summarise(across(where(is.numeric), 
                        list(
    Mean = ~ mean(., na.rm = TRUE),
    SD = ~ sd(., na.rm = TRUE),
    Min = ~ min(., na.rm = TRUE),
    Max = ~ max(., na.rm = TRUE),
    Total = ~ sum(!is.na(.))
  )))

# Replaced blank values with 'UNKNOWN'
city_grouped$city <- ifelse(city_grouped$city == "","UNKNOWN",  city_grouped$city)

# Used kable function to create table
kable(city_grouped, 
               caption= "Descriptive Statistics for table grouped by CITY",
               format = "html") %>%
  kable_styling(full_width = FALSE)
Descriptive Statistics for table grouped by CITY
city num_bldgs_Mean num_bldgs_SD num_bldgs_Min num_bldgs_Max num_bldgs_Total res_units_Mean res_units_SD res_units_Min res_units_Max res_units_Total bldg_value_Mean bldg_value_SD bldg_value_Min bldg_value_Max bldg_value_Total land_sf_Mean land_sf_SD land_sf_Min land_sf_Max land_sf_Total land_value_Mean land_value_SD land_value_Min land_value_Max land_value_Total total_value_Mean total_value_SD total_value_Min total_value_Max total_value_Total gross_tax_Mean gross_tax_SD gross_tax_Min gross_tax_Max gross_tax_Total yr_built_Mean yr_built_SD yr_built_Min yr_built_Max yr_built_Total yr_remodel_Mean yr_remodel_SD yr_remodel_Min yr_remodel_Max yr_remodel_Total bed_rms_Mean bed_rms_SD bed_rms_Min bed_rms_Max bed_rms_Total full_bth_Mean full_bth_SD full_bth_Min full_bth_Max full_bth_Total
UNKNOWN 1 0 1 1 5 3.000000 0.0000000 3 3 5 332760.0 293463.8 7500 646900 5 4562.600 3430.438 2014 8320 5 0.00 0.00 0 0 5 501340.0 263423.7 31000 646900 5 7136.398 5603.741 332.940 15965.490 5 1965.200 43.66578 1920 2007 5 2006.200 4.919350 2004 2015 5 3.400000 0.8944272 3 5 5 0.400000 0.5477226 0 1 5
ALLSTON 1 0 1 1 4423 3.012661 0.2802447 2 9 4423 520738.6 279717.9 0 1413700 4423 2509.162 2197.625 100 10388 4423 104565.70 157528.91 0 561236 4423 635154.2 376256.4 0 1714500 4423 6680.372 4008.301 0.000 18453.467 4423 1931.358 35.22143 1812 2021 4423 1998.340 12.289278 1945 2021 4423 2.773231 1.4335540 0 7 4423 1.172055 0.8496236 0 3 4423
BOSTON 1 0 1 1 47104 3.060674 0.4811500 1 9 47104 543121.5 347347.8 0 1414100 47104 1491.213 1221.085 101 10420 47104 16799.13 79888.56 0 561600 47104 623043.3 395098.0 0 1715700 47104 6447.484 4374.329 0.000 18454.537 47104 1930.113 46.96075 1805 2022 47104 2000.898 12.582038 1945 2022 47104 2.134299 1.0470172 0 7 47104 1.017918 0.8000923 0 3 47104
BRIGHTON 1 0 1 1 11870 3.005392 0.2659766 1 9 11870 484036.3 255687.5 0 1413700 11870 2543.282 2338.322 113 10405 11870 107595.26 154279.18 0 562500 11870 594385.0 358777.3 0 1715200 11870 6280.716 3906.149 0.000 18452.390 11870 1936.164 32.97777 1826 2022 11870 1999.266 11.458126 1950 2022 11870 2.722915 1.5728890 0 7 11870 1.271609 0.7776868 0 3 11870
BROOKLINE 1 0 1 1 24 3.000000 0.0000000 3 3 24 691437.5 327954.3 0 1201800 24 4048.708 1706.775 881 7000 24 310387.50 201782.56 0 518100 24 983700.0 506916.4 0 1688100 24 9999.924 5828.865 0.000 18130.190 24 1919.583 12.51752 1900 1955 24 1998.375 13.044447 1979 2021 24 3.791667 1.2503623 2 7 24 1.250000 1.0320937 0 3 24
CHARLESTOWN 1 0 1 1 7230 2.996957 0.3609698 1 9 7230 555500.1 371028.2 0 1413900 7230 1433.702 1075.834 106 10386 7230 78861.19 145839.93 0 562500 7230 653957.4 431568.4 0 1714300 7230 6839.392 4737.971 0.000 18438.430 7230 1920.288 51.44167 1806 2022 7230 2004.248 9.194087 1950 2022 7230 2.521715 1.0546087 0 7 7230 1.280636 0.8441731 0 3 7230
CHESTNUT HILL 1 0 1 1 1017 3.000000 0.0000000 3 3 1017 419061.6 235049.3 0 1409900 1017 1834.948 1950.346 223 9944 1017 33939.33 91950.68 0 553400 1017 520106.3 364876.0 0 1657900 1017 5546.289 3924.105 0.000 17805.847 1017 1964.448 23.66984 1860 2014 1017 2001.299 9.463415 1978 2021 1017 1.965585 0.8322933 0 7 1017 1.314651 0.5913681 0 3 1017
DEDHAM 1 0 1 1 6 3.000000 0.0000000 3 3 6 316366.7 304932.0 0 822800 6 4680.667 2785.488 1330 8228 6 166383.33 120591.03 12700 281400 6 482750.0 404693.5 12700 1058400 6 5184.733 4346.408 136.400 11367.217 6 1944.167 31.37143 1920 2005 6 2005.167 2.857738 2004 2011 6 3.000000 0.6324555 2 4 6 1.000000 0.8944272 0 2 6
DORCHESTER 1 0 1 1 29212 3.009482 0.2831750 1 9 29212 425904.6 261471.4 0 1411800 29212 3517.647 2049.885 100 10423 29212 138801.47 115702.28 0 562400 29212 570512.4 326680.5 0 1713200 29212 6007.576 3652.608 0.000 18451.320 29212 1924.335 35.51287 1805 2022 29212 2003.973 8.644579 1945 2022 29212 3.605573 1.4174465 0 7 29212 1.499144 1.0505843 0 3 29212
EAST BOSTON 1 0 1 1 9951 3.030047 0.4148034 1 9 9951 462957.1 272373.6 0 1413300 9951 2142.705 1625.491 100 10358 9951 114900.59 118144.26 0 558500 9951 587681.2 334280.7 0 1714600 9951 6263.397 3750.557 0.000 18411.890 9951 1931.983 45.61467 1830 2022 9951 2004.525 8.762263 1950 2022 9951 3.150437 1.4207294 0 7 9951 1.465581 1.0411610 0 3 9951
HYDE PARK 1 0 1 1 9207 2.995981 0.1689739 2 9 9207 314932.1 175887.4 0 1412700 9207 4881.729 2392.979 100 10422 9207 156064.95 92243.90 0 558000 9207 477775.1 228762.2 0 1697900 9207 5056.070 2586.822 0.000 18285.410 9207 1937.738 32.50075 1810 2021 9207 2003.088 8.690376 1945 2021 9207 3.377104 1.1218745 0 7 9207 1.289454 0.8240628 0 3 9207
JAMAICA PLAIN 1 0 1 1 12104 2.994052 0.3702981 2 9 12104 520806.5 329397.4 0 1413700 12104 2882.166 2176.017 100 10414 12104 111333.38 162693.52 0 562500 12104 637322.9 420380.3 0 1715000 12104 6753.021 4621.264 0.000 18450.770 12104 1929.568 39.30802 1814 2021 12104 2003.425 10.101722 1945 2022 12104 3.023629 1.2324365 0 7 12104 1.345010 0.8614332 0 3 12104
MATTAPAN 1 0 1 1 4847 3.006602 0.1989390 2 9 4847 333446.9 204239.3 0 1413000 4847 4528.211 1927.585 115 10416 4847 146725.55 85171.72 0 549200 4847 483808.0 257296.4 0 1704500 4847 5083.583 2854.012 0.000 18154.610 4847 1933.737 29.02555 1820 2021 4847 2002.420 10.018523 1945 2021 4847 3.695069 1.2451567 0 7 4847 1.386218 0.9692975 0 3 4847
NEWTON 1 NA 1 1 1 3.000000 NA 3 3 1 414400.0 NA 414400 414400 1 5090.000 NA 5090 5090 1 215100.00 NA 215100 215100 1 629500.0 NA 629500 629500 1 6760.827 NA 6760.827 6760.827 1 1900.000 NA 1900 1900 1 2004.000 NA 2004 2004 1 4.000000 NA 4 4 1 2.000000 NA 2 2 1
READVILLE 1 0 1 1 2 3.000000 0.0000000 3 3 2 568500.0 112147.1 489200 647800 2 5079.500 4335.272 2014 8145 2 91000.00 128693.43 0 182000 2 719700.0 155704.9 609600 829800 2 3230.594 4568.749 0.000 6461.187 2 1960.000 86.26703 1899 2021 2 2004.000 0.000000 2004 2004 2 3.000000 0.0000000 3 3 2 0.000000 0.0000000 0 0 2
ROSLINDALE 1 0 1 1 9210 2.975462 0.3038920 1 9 9210 402995.8 215551.9 0 1399800 9210 4113.130 2230.198 113 10402 9210 142493.38 112938.21 0 545300 9210 548832.4 280573.1 0 1703806 9210 5870.318 3114.817 0.000 18273.070 9210 1931.258 33.58330 1810 2022 9210 2004.250 9.162412 1945 2022 9210 3.367427 1.1824828 0 7 9210 1.382193 0.8128541 0 3 9210
ROXBURY 1 0 1 1 6168 3.010052 0.3127932 1 9 6168 352143.0 279729.9 0 1413961 6168 2917.603 1969.704 107 10421 6168 103321.71 109489.80 0 561800 6168 468716.3 337425.4 0 1713100 6168 4743.391 3741.367 0.000 18299.890 6168 1928.715 42.71809 1821 2022 6168 2003.587 7.879435 1947 2022 6168 3.341278 1.1846411 0 7 6168 1.126135 1.0664532 0 3 6168
ROXBURY CROSSING 1 0 1 1 1832 3.008734 0.2743945 2 9 1832 511600.5 383344.3 0 1414100 1832 2532.479 1772.362 185 10412 1832 156740.27 187520.89 0 561700 1832 628109.6 429977.7 0 1713743 1832 6234.123 4929.640 0.000 18434.140 1832 1932.779 41.65142 1835 2021 1832 2003.543 8.788075 1950 2021 1832 3.173581 1.1102032 1 7 1832 1.162664 1.0723490 0 3 1832
SOUTH BOSTON 1 0 1 1 15439 3.026297 0.4889850 1 9 15439 540599.4 366421.9 0 1410600 15439 1623.014 1203.471 100 10418 15439 72626.13 132886.70 0 562400 15439 618473.1 420574.7 0 1715800 15439 6586.627 4566.800 0.000 18452.397 15439 1933.085 48.84513 1810 2021 15439 2004.314 7.956842 1950 2022 15439 2.658657 1.1137855 0 7 15439 1.296328 0.8623076 0 3 15439
WEST ROXBURY 1 0 1 1 10975 2.992893 0.1711478 2 8 10975 391766.7 229428.4 0 1414100 10975 4591.690 2337.698 110 10416 10975 179690.70 119162.34 0 561500 10975 579196.8 310047.0 0 1713000 10975 6118.703 3420.295 0.000 18309.547 10975 1937.725 29.00552 1804 2021 10975 2004.670 7.891918 1945 2022 10975 3.220046 0.9876027 0 7 10975 1.224966 0.7930328 0 3 10975
  • Descriptive Statistics for df grouped by HEAT TYPE

# Created a separate df with below heads for each
heat_type_grouped <- property_extract %>%
                         group_by(heat_type) %>%
                         summarise(across(where(is.numeric), 
                        list(
    Mean = ~ mean(., na.rm = TRUE),
    SD = ~ sd(., na.rm = TRUE),
    Min = ~ min(., na.rm = TRUE),
    Max = ~ max(., na.rm = TRUE),
    Total = ~ sum(!is.na(.))
  )))

# Replaced blank values with 'UNKNOWN'
heat_type_grouped$heat_type <- ifelse(heat_type_grouped$heat_type == "","UNKNOWN",  heat_type_grouped$heat_type)

kable(heat_type_grouped, 
               caption= "Descriptive Statistics for table grouped by HEAT TYPE",
               format = "html") %>%
  kable_styling(full_width = FALSE)
Descriptive Statistics for table grouped by HEAT TYPE
heat_type num_bldgs_Mean num_bldgs_SD num_bldgs_Min num_bldgs_Max num_bldgs_Total res_units_Mean res_units_SD res_units_Min res_units_Max res_units_Total bldg_value_Mean bldg_value_SD bldg_value_Min bldg_value_Max bldg_value_Total land_sf_Mean land_sf_SD land_sf_Min land_sf_Max land_sf_Total land_value_Mean land_value_SD land_value_Min land_value_Max land_value_Total total_value_Mean total_value_SD total_value_Min total_value_Max total_value_Total gross_tax_Mean gross_tax_SD gross_tax_Min gross_tax_Max gross_tax_Total yr_built_Mean yr_built_SD yr_built_Min yr_built_Max yr_built_Total yr_remodel_Mean yr_remodel_SD yr_remodel_Min yr_remodel_Max yr_remodel_Total bed_rms_Mean bed_rms_SD bed_rms_Min bed_rms_Max bed_rms_Total full_bth_Mean full_bth_SD full_bth_Min full_bth_Max full_bth_Total
UNKNOWN 1 0 1 1 47845 3.077793 0.7121207 1 9 47845 208733.7 315449.8 0 1414100 47845 2938.097 2059.0450 100 10423 47845 62072.745 122235.18 0 562500 47845 305120.3 395241.1 0 1715400 47845 2796.512 4227.403 0.00 18452.39 47845 1930.399 34.35975 1809 2022 47845 2003.195 7.982582 1945 2022 47845 2.999352 0.0362817 0 4 47845 0.2312049 0.4312148 0 3 47845
E - Electric 1 0 1 1 5903 3.000678 0.0552210 2 6 5903 533664.7 245569.6 0 1410200 5903 1387.873 1462.2430 200 10290 5903 38228.991 89599.94 0 554800 5903 588274.9 284122.7 0 1715200 5903 6288.321 3087.277 0.00 18442.73 5903 1928.151 43.37957 1820 2021 5903 1997.272 13.164108 1950 2022 5903 2.063866 1.2108262 0 7 5903 1.3023886 0.5666724 1 3 5903
F - Forced Hot Air 1 0 1 1 49801 2.998996 0.0502903 1 6 49801 609425.9 253320.9 0 1413700 49801 2242.838 1938.0174 130 10422 49801 74704.410 117741.70 0 560900 49801 710966.0 295216.7 0 1715500 49801 7614.558 3194.420 0.00 18453.47 49801 1946.643 52.32474 1805 2022 49801 2005.902 8.118993 1945 2022 49801 2.676091 1.3914254 0 7 49801 1.6844441 0.6932265 0 3 49801
N - None 1 0 1 1 89 3.000000 0.0000000 3 3 89 453149.9 206383.0 97100 1342000 89 4565.787 2110.0804 774 9600 89 202719.787 86422.87 0 418800 89 651698.9 228759.3 171700 1272500 89 6995.384 2457.658 1844.06 13666.65 89 1920.955 30.73712 1866 2016 89 2002.865 7.616058 1980 2021 89 3.955056 1.4764062 1 7 89 1.9662921 0.8181109 1 3 89
O - Other 1 0 1 1 41 3.000000 0.0000000 3 3 41 558768.3 232427.7 10300 1299900 41 3182.488 2329.2779 460 8888 41 123126.829 124580.42 0 385200 41 686251.2 240723.1 247100 1685100 41 7355.669 2590.357 2653.85 18097.98 41 1949.512 50.70953 1865 2020 41 2004.878 9.233621 1979 2021 41 2.853658 1.4415439 1 7 41 1.6341463 0.6616683 1 3 41
P - Heat Pump 1 0 1 1 4619 3.000000 0.0000000 3 3 4619 686830.3 278793.7 144900 1413900 4619 1284.755 833.0167 280 9473 4619 5796.932 36741.34 0 486300 4619 780936.9 335075.7 155200 1715000 4619 8359.741 3636.198 0.00 18441.66 4619 1950.080 51.18480 1840 2021 4619 2000.201 11.828843 1975 2022 4619 1.771379 0.8433594 0 7 4619 1.4602728 0.5769989 1 3 4619
S - Space Heat 1 0 1 1 677 2.998523 0.0666009 2 4 677 604080.3 250784.0 0 1357600 677 1986.380 1258.5560 280 9000 677 208835.430 110051.45 0 560800 677 808742.4 319176.5 0 1715000 677 8647.871 3450.326 0.00 18419.10 677 1900.363 16.71625 1835 2020 677 2002.171 9.294661 1950 2022 677 4.011817 1.6371724 0 7 677 2.3604136 0.8444555 0 3 677
W - Ht Water/Steam 1 0 1 1 71652 2.999442 0.0659858 1 9 71652 534516.6 227739.7 0 1414100 71652 3035.156 2350.4034 150 10422 71652 140417.826 133263.99 0 562500 71652 680449.6 286073.8 0 1715800 71652 7285.557 3093.880 0.00 18454.54 71652 1919.102 30.46559 1804 2021 71652 2000.741 11.954586 1945 2022 71652 3.088023 1.6611277 0 7 71652 1.6210573 0.7563859 0 3 71652
  • Descriptive Statistics for df grouped by PROPERTY VIEW

# Created a separate df with below heads for each
prop_view_grouped <- property_extract %>%
                         group_by(prop_view) %>%
                         summarise(across(where(is.numeric), 
                        list(
    Mean = ~ mean(., na.rm = TRUE),
    SD = ~ sd(., na.rm = TRUE),
    Min = ~ min(., na.rm = TRUE),
    Max = ~ max(., na.rm = TRUE),
    Total = ~ sum(!is.na(.))
  )))

# Replaced blank values with 'UNKNOWN'
prop_view_grouped$prop_view <- ifelse(prop_view_grouped$prop_view  == "","UNKNOWN",  prop_view_grouped$prop_view )

kable(prop_view_grouped, 
               caption= "Descriptive Statistics for table grouped by PROPERTY VIEW",
               format = "html") %>%
  kable_styling(full_width = FALSE)
Descriptive Statistics for table grouped by PROPERTY VIEW
prop_view num_bldgs_Mean num_bldgs_SD num_bldgs_Min num_bldgs_Max num_bldgs_Total res_units_Mean res_units_SD res_units_Min res_units_Max res_units_Total bldg_value_Mean bldg_value_SD bldg_value_Min bldg_value_Max bldg_value_Total land_sf_Mean land_sf_SD land_sf_Min land_sf_Max land_sf_Total land_value_Mean land_value_SD land_value_Min land_value_Max land_value_Total total_value_Mean total_value_SD total_value_Min total_value_Max total_value_Total gross_tax_Mean gross_tax_SD gross_tax_Min gross_tax_Max gross_tax_Total yr_built_Mean yr_built_SD yr_built_Min yr_built_Max yr_built_Total yr_remodel_Mean yr_remodel_SD yr_remodel_Min yr_remodel_Max yr_remodel_Total bed_rms_Mean bed_rms_SD bed_rms_Min bed_rms_Max bed_rms_Total full_bth_Mean full_bth_SD full_bth_Min full_bth_Max full_bth_Total
UNKNOWN 1 0 1 1 46762 3.083465 0.6995163 1 9 46762 213292.0 317372.6 0 1414100 46762 2921.539 2054.522 100 10423 46762 63454.38 123235.32 0 562500 46762 311818.0 396888.8 0 1715400 46762 2857.406 4252.462 0.000 18452.39 46762 1930.479 33.93058 1810 2022 46762 2003.173 8.065095 1945 2022 46762 2.999359 0.0403097 0 6 46762 0.2137205 0.4199347 0 2 46762
A - Average 1 0 1 1 110120 2.997675 0.1144094 1 9 110120 548101.0 239026.0 0 1414100 110120 2786.564 2259.478 130 10422 110120 117133.71 130564.01 0 562500 110120 672112.8 289630.8 0 1715800 110120 7196.673 3129.552 0.000 18454.54 110120 1928.304 41.18939 1804 2022 110120 2002.687 10.821214 1945 2022 110120 2.979831 1.5628980 0 7 110120 1.6295768 0.7360373 0 3 110120
E - Excellent 1 0 1 1 4383 2.997718 0.0522807 1 3 4383 681014.6 282790.8 0 1413900 4383 1707.764 1220.896 245 9960 4383 20888.25 82780.68 0 562400 4383 838833.1 343989.6 0 1715500 4383 8942.843 3800.562 0.000 18439.51 4383 1967.383 48.49512 1816 2020 4383 2003.522 10.207317 1950 2022 4383 2.066393 1.1432815 0 7 4383 1.7246178 0.6675318 1 3 4383
F - Fair 1 0 1 1 6113 2.998364 0.0922235 2 6 6113 528987.9 250489.5 0 1411300 6113 2092.653 1920.749 170 10402 6113 119164.92 133328.72 0 559400 6113 652085.9 303785.6 0 1715800 6113 7000.959 3274.319 0.000 18438.43 6113 1914.331 36.54574 1806 2021 6113 1999.968 12.735458 1950 2022 6113 2.712743 1.7095923 0 7 6113 1.5951251 0.7615746 0 3 6113
G - Good 1 0 1 1 12405 3.000726 0.1531654 1 9 12405 677576.9 273359.9 0 1413000 12405 1633.498 1549.584 175 10407 12405 32344.30 93186.52 0 561000 12405 775131.8 322636.8 0 1715500 12405 8303.874 3498.403 0.000 18448.10 12405 1947.536 52.46475 1809 2022 12405 2002.061 11.777985 1950 2022 12405 2.030149 1.1493140 0 7 12405 1.5378476 0.6228142 0 3 12405
P - Poor 1 0 1 1 534 2.998127 0.1145849 2 5 534 588746.9 275432.5 0 1406400 534 1813.234 1654.182 175 9078 534 151871.30 146943.01 0 560500 534 737490.1 342281.8 0 1683300 534 7884.944 3706.175 0.000 18078.64 534 1904.974 34.10462 1820 2018 534 1999.320 14.945695 1950 2022 534 2.962547 1.7300191 0 7 534 1.7134831 0.8139991 1 3 534
S - Special 1 0 1 1 310 3.000000 0.0000000 3 3 310 521921.0 146638.2 489200 1231100 310 2482.684 1054.538 839 7557 310 0.00 0.00 0 0 310 670428.4 210576.6 609600 1697200 310 7160.354 2372.915 6461.187 18447.02 310 1997.629 31.89994 1900 2020 310 2006.110 5.571039 1990 2022 310 2.541936 0.7652544 1 6 310 2.0290323 0.6984465 1 3 310
  • Descriptive Statistics for SAMPLE 1 grouped by PROPERTY VIEW

# Created a sample of 10,000 values randomly
sample1 <- sample_n(property_extract,10000)

# Created a separate df with below heads for each grouped by prop view
sample1_grouped <- sample1 %>%
                         group_by(prop_view) %>%
                         summarise(across(where(is.numeric), 
                        list(
    Mean = ~ mean(., na.rm = TRUE),
    SD = ~ sd(., na.rm = TRUE),
    Min = ~ min(., na.rm = TRUE),
    Max = ~ max(., na.rm = TRUE),
    Total = ~ sum(!is.na(.))
  )))

# Replaced blank values with 'UNKNOWN'
sample1_grouped$prop_view <- ifelse(sample1_grouped$prop_view  == "","UNKNOWN",  sample1_grouped$prop_view )

kable(sample1_grouped, 
               caption= "Descriptive Statistics for SAMPLE 1 grouped by PROPERTY VIEW",
               format = "html") %>%
  kable_styling(full_width = FALSE)
Descriptive Statistics for SAMPLE 1 grouped by PROPERTY VIEW
prop_view num_bldgs_Mean num_bldgs_SD num_bldgs_Min num_bldgs_Max num_bldgs_Total res_units_Mean res_units_SD res_units_Min res_units_Max res_units_Total bldg_value_Mean bldg_value_SD bldg_value_Min bldg_value_Max bldg_value_Total land_sf_Mean land_sf_SD land_sf_Min land_sf_Max land_sf_Total land_value_Mean land_value_SD land_value_Min land_value_Max land_value_Total total_value_Mean total_value_SD total_value_Min total_value_Max total_value_Total gross_tax_Mean gross_tax_SD gross_tax_Min gross_tax_Max gross_tax_Total yr_built_Mean yr_built_SD yr_built_Min yr_built_Max yr_built_Total yr_remodel_Mean yr_remodel_SD yr_remodel_Min yr_remodel_Max yr_remodel_Total bed_rms_Mean bed_rms_SD bed_rms_Min bed_rms_Max bed_rms_Total full_bth_Mean full_bth_SD full_bth_Min full_bth_Max full_bth_Total
UNKNOWN 1 0 1 1 2639 3.095112 0.7167251 2 9 2639 211855.6 309928.1 0 1410100 2639 2949.981 2053.2866 106 10388 2639 60444.51 120733.76 0 561500 2639 307873.4 388765.0 0 1713000 2639 2839.393 4243.914 0.000 18287.00 2639 1931.022 34.50532 1850 2022 2639 2003.361 7.876098 1950 2022 2639 3.000000 0.0000000 3 3 2639 0.2068966 0.4134923 0 2 2639
A - Average 1 0 1 1 6038 2.997847 0.1280400 2 8 6038 542108.1 234181.1 0 1411900 6038 2767.004 2243.9249 175 10416 6038 117157.88 130669.77 0 557100 6038 669010.2 291258.6 0 1713600 6038 7163.742 3151.849 0.000 18454.54 6038 1927.243 41.04945 1808 2021 6038 2002.518 10.827786 1945 2022 6038 2.998675 1.5724979 0 7 6038 1.6425969 0.7418849 0 3 6038
E - Excellent 1 0 1 1 233 3.000000 0.0000000 3 3 233 670653.2 286681.6 0 1407500 233 1623.532 1094.1985 372 8835 233 26077.25 96012.74 0 559300 233 867436.5 374418.7 0 1712000 233 9257.605 4091.659 0.000 18386.88 233 1967.137 54.15677 1816 2020 233 2002.906 9.706007 1960 2021 233 2.064378 1.1142384 0 6 233 1.7553648 0.6729240 1 3 233
F - Fair 1 0 1 1 347 2.994236 0.0758092 2 3 347 559253.9 281197.8 0 1409400 347 2093.118 1885.5264 207 9602 347 121443.59 133125.83 0 530800 347 679035.2 320242.8 0 1619700 347 7273.361 3432.970 0.000 17395.58 347 1914.236 38.97689 1825 2019 347 2000.726 12.793264 1950 2021 347 2.688761 1.6746893 0 7 347 1.5850144 0.7645356 1 3 347
G - Good 1 0 1 1 690 3.002899 0.2409289 2 9 690 674122.2 279496.6 0 1406500 690 1634.370 1572.6911 312 10365 690 28144.20 88922.42 0 510500 690 775186.2 334498.9 0 1702400 690 8308.192 3607.061 0.000 18283.78 690 1953.788 52.52591 1832 2020 690 2002.130 11.694854 1950 2022 690 2.008696 1.1630173 0 7 690 1.5449275 0.6249612 1 3 690
P - Poor 1 0 1 1 38 3.000000 0.0000000 3 3 38 642939.5 311308.2 157800 1379700 38 2235.579 1903.9094 290 7133 38 165447.37 135939.21 0 473600 38 788121.1 335012.7 214800 1487200 38 8462.159 3599.301 2306.950 15972.53 38 1909.658 36.35952 1840 2009 38 1999.816 16.261138 1950 2021 38 2.789474 1.5795637 1 6 38 1.7105263 0.7678645 1 3 38
S - Special 1 0 1 1 15 3.000000 0.0000000 3 3 15 538493.3 190912.3 489200 1228600 15 1949.200 706.3606 1219 3696 15 0.00 0.00 0 0 15 650866.7 159825.1 609600 1228600 15 6910.119 1738.704 6461.187 13195.16 15 1996.133 29.61628 1900 2020 15 2007.933 5.945787 2004 2019 15 2.133333 0.3518658 2 3 15 2.0000000 0.3779645 1 3 15
  • Creating 3 different Samples and summarizing them

# Summarizes statistics for Sample 1, including the mean values of building value, land value, gross tax, and bed rooms.
Sample1_summary<- data.frame(
  sample1%>%
    summarise(
          Data_Frame = "Sample 1",
      Mean_Building_Value = mean(bldg_value),
      Mean_Land_Value = mean(land_value),
      Mean_Gross_Tax = mean(gross_tax),
      Mean_Bed_Rms = mean(bed_rms)
    ))

# Creating sample 2 with same cols as above
sample2 <- sample_n(property_extract,10000)
Sample2_summary<- data.frame(
  sample2%>%
    summarise(
      Data_Frame = "Sample 2",
      Mean_Building_Value = mean(bldg_value),
      Mean_Land_Value = mean(land_value),
      Mean_Gross_Tax = mean(gross_tax),
      Mean_Bed_Rms = mean(bed_rms)
    ))

# Creating sample 3 with same cols as above
sample3 <- sample_n(property_extract,10000)
Sample3_summary<- data.frame(
  sample3%>%
    summarise(
           Data_Frame = "Sample 3",
      Mean_Building_Value = mean(bldg_value),
      Mean_Land_Value = mean(land_value),
      Mean_Gross_Tax = mean(gross_tax),
      Mean_Bed_Rms = mean(bed_rms)
    ))
  • Creating MAIN summary df from the cleaned dataset
Main_df <-  data.frame(
  property_df_cleaned%>%
    summarise(
          Data_Frame = "Main df",
      Mean_Building_Value = mean(bldg_value),
      Mean_Land_Value = mean(land_value),
      Mean_Gross_Tax = mean(gross_tax),
      Mean_Bed_Rms = mean(bed_rms)
    ))
  • Table 1: Binding rows to create one single dataframe
# Use binding function to make the comparison
combined_data <- bind_rows(Main_df, Sample1_summary, Sample2_summary, Sample3_summary)

# Print the combined data frame with a caption using kable
kable(combined_data, caption = "Table 1: Mean of Main dataframe combined with 3 Samples",
      format = "html") %>%
  kable_styling(full_width = FALSE)
Table 1: Mean of Main dataframe combined with 3 Samples
Data_Frame Mean_Building_Value Mean_Land_Value Mean_Gross_Tax Mean_Bed_Rms
Main df 472968.9 95048.67 6187.053 2.887658
Sample 1 468031.2 94083.58 6158.658 2.896100
Sample 2 478639.1 94017.67 6270.704 2.867300
Sample 3 478344.7 94991.95 6229.032 2.877500

The descriptive statistics presented in Table 1 provide insights into the central tendency and variability of Building Value, Land Value, Gross Tax, and Bed Rooms for the main data frame (Main df) and three samples (Sample 1, Sample 2, Sample 3). The “Mean” column indicates the average values for each variable in the respective datasets. Comparing the means across the main dataset and samples allows us to understand that the mean of sampled data aligns with the mean of main dataset.

There is not much difference on comparing which suggests that the sampled data is relatively representative of the main dataset for the considered variables (Building Value, Land Value, Gross Tax, and Bed Rooms). This suggests that the random sampling process for creating Sample 1, Sample 2, and Sample 3 has captured the average values of the variables in a way that is consistent with the overall dataset.


PART 2 - Visualiztion

  • Jitter Chart:

    A jitter chart is used to visualize the distribution of a dataset when there is overlap in values. It is particularly useful in scenarios where there are discrete categories on one axis and continuous data on the other. Jitter chart avoids over plotting and overlapping.

  • Boxplots:

    A boxplot is a graphical representation that displays the distribution of a dataset and provides a visual summary of its key statistical measures. Box plot is used for detecting outliers. Boxplots allow for a quick visual inspection of the spread of the data.

  • Jitter plot for Land values
# Creating a jitter plot for Land values
ggplot(property_df_cleaned, aes(x = heat_type, y = land_value)) +
  geom_jitter(width = 0.2, height = 0, color = "blue", alpha = 0.7, size= 0.25) +
  labs(title = "Jitter Plot for Land Values by Heat Type",
       x = "Heat Type",
       y = "Land Value") +
  theme_minimal()+
   theme(axis.text.x = element_text(angle = 30, hjust = 1))+
   scale_y_continuous( 
    breaks = seq(0,500000, by=50000),
    limits = c(0,600000))

# Changing axis labels to non-scientific notation
options(scipen = 999)

The chart visualizes the distribution of land_value across different heat_type categories using jittered points.The land_value is represented by the y-axis, while the heat_type is represented by the x-axis. We don’t have adequate information about the sort of heat used. Among the available data, “hot water steam” is the most frequently used heat type followed by “forced hot air,” the next most used heat type.

  • Combined Scatterplot
par(mfrow = c(1, 2)) 

# Scatterplot 1: land_value vs. land_sf
plot(property_extract$land_value, property_extract$land_sf, 
     main = "Scatterplot: bed_rms vs. full_bth",
     xlab = "Land Value", ylab = "Land Area", col = "blue", cex= 0.25)

# Scatterplot 2: yr_built vs. yr_remodel
plot(property_extract$yr_built, property_extract$yr_remodel, 
     main = "Scatterplot: yr_built vs. yr_remodel ",
     xlab = "yr_built", ylab = "yr_remodel", col = "red", cex= 0.25)

# Reset the layout to default (1x1)
par(mfrow = c(1, 1))

Dispplayed two scatterplot side by side using par(). * First, the left scatterplot displays the relationship between land_value and land_sf. The plot illustrates the distribution of land values concerning land area. * The right scatterplot compares yr_built and yr_remodel,providing insight into the relationship between the year a property was built and the year it was remodeled.

  • Box Plot for Property View
# Create a basic plot
boxplot(bldg_value ~ prop_view, 
        data = property_extract, 
        main = "Box Plot for Property View",
        col = "lightblue",  
        border = "blue", 
        cex= 0.5,
        notch = FALSE,     
        notchwidth = 0.25,  
        xlab = "prop_view",
        ylab = "bldg_value",
        las= 2
)

The box plot above illustrates the distribution of building values (bldg_value) across different property views (prop_view). The horizontal line inside each box represents the median building value for the corresponding property view category. The boxes indicate the interquartile range (IQR), representing the middle 50% of the data.

  • Scatter plot between Land Area in sq_feet and Land Value
ggplot(property_extract, aes(x = total_value , y = gross_tax)) +
  geom_point(size=0.0025) +
  labs(title = "Relationship between Land Area in sq_feet and Land Value",
  x= "Land Area in sq",
  y= "Land Value in $") 

The scatter plot above visualizes the relationship between land area (in square feet) and land value (in dollars). The scatter plot shows an upward/positive trend between land area and land value. Positive trends indicates that the land value increases with an increase in land area. Land value and land area are directly proportional for this dataset.

  • Scatter plot between Land Area in sq_feet and Land Value
V5<- data.frame(property_extract%>%
                                group_by(city)%>%
                                summarise(total =n()))

V5$city <- ifelse(V5$city == "",NA, V5$city)
V5<- na.omit(V5)

V5_plot <- ggplot(V5, aes(x = total, y= reorder (city,total), fill= city)) +
  geom_bar(stat="identity") +
  labs(title = "Total parcels in each city", x = "Total", y = "City")

ggplotly(V5_plot) %>% layout(showlegend = FALSE)

The above graph depicts the total number of parcels in each city. The above graph is plotted using ggplot. According to the chart above, Boston has the highest number of parcels that is 47,104 and Newton has the lowest number of parcels which equals to 1. There are a total of 19 cities in the dataset.